home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Gold Medal Software 2
/
Gold Medal Software Volume 2 (Gold Medal) (1994).iso
/
archive
/
ntzip.arj
/
IM_CTREE.C
< prev
next >
Wrap
C/C++ Source or Header
|
1991-10-03
|
46KB
|
1,402 lines
/*
Copyright (C) 1990,1991 Mark Adler, Richard B. Wales, and Jean-loup Gailly.
Permission is granted to any individual or institution to use, copy, or
redistribute this software so long as all of the original files are included
unmodified, that it is not sold for profit, and that this copyright notice
is retained.
*/
/*
* im_ctree.c by Richard B. Wales.
*
* Includes modifications by Jean-Loup Gailly.
*
* PURPOSE
*
* Encode various sets of source values using variable-length
* binary code trees.
*
* DISCUSSION
*
* The PKZIP "implosion" process uses several variable-depth binary
* code trees, similar to Huffman trees. The more common source
* values are represented by shorter bit sequences.
*
* Each code tree is stored in the ZIP file in a compressed form
* that is essentially a run-length-encoded list of the lengths of
* all the code strings (in ascending order by source values).
* The actual code strings are reconstructed from the lengths in
* the UNZIP process, as described in the "application note"
* (APPNOTE.TXT) distributed as part of PKWARE's PKZIP program.
*
* Because of the way a code tree is stored in the ZIP file, the
* codes must conform to the following restrictions:
*
* (1) No code string may ever exceed 16 bits in length.
*
* (2) If all code strings are extended to 16 bits by padding on
* the right (low-order end) with zeros, and then treated as
* unsigned 16-bit integers, then:
*
* (a) The arithmetically higher 16-bit values must correspond
* to the shorter code strings.
*
* (b) If two source values map into code strings of the same
* length, the higher code string must correspond to the
* lower source value (where source values are treated as
* unsigned).
*
* Further, shortcuts taken by PKUNZIP 1.10 in the way it decodes
* the compressed data via the trees impose the following extra
* limitations:
*
* (3) No code string in the "distance" tree can be longer than 8
* bits.
*
* (4) The maximum length of any code string is limited by the
* number of initial zero bits, as follows:
*
* (a) 0-3 leading zeros: maximum length is 8.
*
* (b) 4-5 leading zeros: maximum length is 12.
*
* (c) 6-7 leading zeros: maximum length is 14.
*
* (5) In the "literal" tree, the code string corresponding to the
* source value 255 must be at least 10 bits in length, regard-
* less of the frequency of the value 255 in the file.
*
* PKWARE calls the code trees "Shannon-Fano trees". (Shannon-Fano
* coding was a predecessor to the better-known Huffman coding
* technique; see the references below.) Although it appears that
* the Shannon-Fano (top-down partitioning) algorithm is in fact
* used by PKZIP in the process of creating the code trees, the
* resulting trees are not in fact "pure" Shannon-Fano, because of
* the extra processing required in order to meet the restrictions
* described above.
*
* REFERENCES
*
* Lynch, Thomas J.
* Data Compression: Techniques and Applications, pp. 53-55.
* Lifetime Learning Publications, 1985. ISBN 0-534-03418-7.
*
* Storer, James A.
* Data Compression: Methods and Theory, pp. 49-50.
* Computer Science Press, 1988. ISBN 0-7167-8156-5.
*
* INTERFACE
*
* ImpErr ct_init (void)
* Initialize the code tree routines. This routine must be
* called before any other code tree routines may be called.
*
* ImpErr ct_tally (MATCH *ma)
* Tally a "string match" data set. The tally results will be
* used to determine how large the imploded result will be.
*
* ImpErr ct_mktrees (FDATA *fd)
* Construct all code trees, and then determine how big the
* imploded file will be under both "literal tree" and "no
* literal tree" conditions. Choose the best option.
*
* ImpErr ct_wrtrees (FILE *outfp)
* Output the code trees to the ZIP file.
*
* ImpErr ct_wrdata (FILE *outfp)
* Output the file data to the ZIP file.
*
* ImpErr ct_windup (void)
* Deallocate all code trees.
*/
#ifdef DEBUG
# define VALIDATE
#endif /* DEBUG */
#include "implode.h"
/***********************************************************************
*
* Local data used by the "code tree" routines.
*/
/* Data structure describing a single value and its code string. */
typedef
struct ct_data
{ UL_INT ct_freq; /* frequency count */
US_INT ct_code; /* bit string */
U_CHAR ct_len; /* length of bit string */
U_CHAR ct_val; /* source value */
}
TRDATA;
/*
* Data structure for re-sorting source values by bit string length;
* used during tree generation.
*/
typedef
struct ct_resort
{ U_CHAR ct_rlen; /* length of bit string */
U_CHAR ct_rval; /* source value */
#ifdef VMS
US_INT dummy; /* because of bug in qsort() */
#endif /* VMS */
}
RESORT;
/* Header for a code tree. */
typedef
struct ct_desc
{ TRDATA *ct_array; /* array of TRDATA */
int ct_size; /* # of entries in tree */
}
TRDESC;
/*
* Currently active code trees.
* We allow for five trees (one literal tree, two length trees, and
* two distance trees) so that we can evaluate the total compression
* with or without the use of the literal tree. Remember that the
* minimum matching distance depends on whether or not a literal tree
* is used; hence, the "length" and "distance" trees will differ.
*/
#define MAXTREES 5 /* max # of trees at once */
local TRDESC ct_table[MAXTREES];
/* Macro to validate a code tree handle. */
#define VALID_HANDLE(x) \
((x) >= 0 && (x) < MAXTREES && ct_table[x].ct_array != NULL)
/*
* Assorted frequency counts.
* The "Minimum Match Length" (MML) is 2 if a "literal character" code
* tree is not used, or 3 if a "literal character" tree is used. Thus,
* we need to keep two sets of "length" and "distance" frequency counts.
* Also, 2-character matches need to be counted separately; depending
* on whether a "literal character" tree is used or not, these will be
* processed either as literal characters or as distance/length pairs.
*/
/* Total number of source values from Pass One. */
local long ct_litc_num; /* total # literal chars */
local long ct_lit2_num; /* total # of 2-char matches */
local long ct_strg_num; /* total # of string matches */
/* Source value frequencies for the five trees. */
local long ct_litc_freq[256]; /* literal character freqs */
local long ct_len2_freq[64]; /* length freqs (MML=2) */
local long ct_len3_freq[64]; /* length freqs (MML=3) */
local long ct_dst2_freq[64]; /* distance freqs (MML=2) */
local long ct_dst3_freq[64]; /* distance freqs (MML=3) */
/* Number of bits saved by using each of the trees. */
local long ct_litc_saved; /* literal tree */
local long ct_len2_saved; /* length tree (MML=2) */
local long ct_len3_saved; /* length tree (MML=3) */
local long ct_dst2_saved; /* distance tree (MML=2) */
local long ct_dst3_saved; /* distance tree (MML=3) */
/* Handles for the code trees to be used. */
local int ct_litc_tree; /* temp literal tree */
local int ct_len2_tree; /* temp length tree (MML=2) */
local int ct_len3_tree; /* temp length tree (MML=3) */
local int ct_dst2_tree; /* temp distance tree (MML=2) */
local int ct_dst3_tree; /* temp distance tree (MML=3) */
local int lit_tree; /* literal tree (-1 if none) */
local int len_tree; /* length tree */
local int dst_tree; /* distance tree */
/***********************************************************************
*
* Local (static) routines in this file.
*/
local ImpErr ct_alloc
OF ((int size, int *handle));
local ImpErr ct_free
OF ((int handle));
local ImpErr ct_loadf
OF ((int handle, long *freq));
local ImpErr ct_ziprep
OF ((int handle, U_CHAR **result));
local ImpErr ct_gencodes
OF ((int handle, int minbits, int maxbits, long *saved));
local ImpErr ct_split
OF ((TRDATA *part, int size, long freq,
int prefix, int preflen, int minbits, int maxbits));
local int ct_fsort
OF ((TRDATA *tr1, TRDATA *tr2));
local int ct_rsort
OF ((RESORT *cr1, RESORT *cr2));
/***********************************************************************
*
* Allocate a code tree.
*/
local
ImpErr
ct_alloc (size, handle)
int size;
int *handle;
{ register TRDATA *ct;
int n;
#ifdef VALIDATE
/* Validate the arguments. */
if (size < 2 || size > 256) goto badarg;
if (handle == NULL) goto badarg;
#endif /* VALIDATE */
/* Allocate an available code tree handle. */
for (n = 0;
n < MAXTREES && ct_table[n].ct_array != NULL;
n++) ;
if (n >= MAXTREES) return IM_NOCTBLS;
*handle = n;
ct_table[n].ct_size = size;
/* Allocate space for the code tree. */
ct = (TRDATA *) malloc ((unsigned) (size * sizeof (TRDATA)));
if (ct == NULL) return IM_NOMEM;
ct_table[n].ct_array = ct;
/* Initialize the code tree. */
for (n = 0; n < size; n++, ct++)
{ ct->ct_freq = 0;
ct->ct_code = 0;
ct->ct_val = (U_CHAR)n;
ct->ct_len = 0;
}
/* That's all. */
return IM_OK;
#ifdef VALIDATE
badarg:
fprintf (stderr, "\nError in ct_alloc: bad argument(s)");
return IM_BADARG;
#endif /* VALIDATE */
}
/***********************************************************************
*
* Free a code tree.
*/
local
ImpErr
ct_free (handle)
int handle;
{
#ifdef VALIDATE
/* Validate the argument. */
if (!VALID_HANDLE (handle)) goto badarg;
#endif /* VALIDATE */
/* Free the code tree. */
free ((char *) ct_table[handle].ct_array);
ct_table[handle].ct_array = NULL;
ct_table[handle].ct_size = 0;
/* That's all. */
return IM_OK;
#ifdef VALIDATE
badarg:
fprintf (stderr, "\nError in ct_free: bad argument(s)");
return IM_BADARG;
#endif /* VALIDATE */
}
/***********************************************************************
*
* Update a code tree with frequency data.
*
* In order to allow for the later addition of "adaptive" coding,
* the new frequencies are added to the existing data.
*/
local
ImpErr
ct_loadf (handle, freq)
int handle;
long *freq;
{ register long *f;
register TRDATA *ct;
int n;
#ifdef VALIDATE
/* Validate the arguments. */
if (!VALID_HANDLE (handle)) goto badarg;
#endif /* VALIDATE */
/* Add in the frequencies. */
for (f = freq,
ct = ct_table[handle].ct_array,
n = ct_table[handle].ct_size;
n > 0;
f++, ct++, n--)
ct->ct_freq += *f;
/* That's all. */
return IM_OK;
#ifdef VALIDATE
badarg:
fprintf (stderr, "\nError in ct_loadf: bad argument(s)");
return IM_BADARG;
#endif /* VALIDATE */
}
/***********************************************************************
*
* Generate the ZIP-file representation for a code tree.
*
* The returned "result" value points to static data which will be
* overwritten on the next call to "ct_ziprep".
*
* The length of the result string is implicit in the first byte of
* the value, as specified in the PKZIP applications note.
*/
#ifdef IMPDEBUG
char *treename;
#endif /* IMPDEBUG */
local
ImpErr
ct_ziprep (handle, result)
int handle;
U_CHAR **result;
{ static U_CHAR buffer[257]; /* result info */
register U_CHAR *c;
register TRDATA *ct;
int s, n, l;
#ifdef VALIDATE
/* Validate the arguments. */
if (!VALID_HANDLE (handle)) goto badarg;
if (result == NULL) goto badarg;
#endif /* VALIDATE */
#ifdef IMPDEBUG
if (treename != NULL && treename[0] != 0)
{ /* Print the code tree info. */
fprintf (stderr, "\n%s tree:\n value len string\n",
treename);
for (ct = ct_table[handle].ct_array,
s = ct_table[handle].ct_size,
n = 0;
s > 0;
ct++, n++, s--)
fprintf (stderr, " %3d (%02x) %2d %04x (rev %04x)\n",
n, n, ct->ct_len,
bi_reverse(ct->ct_code << (16 - ct->ct_len),
ct->ct_len) << (16 - ct->ct_len),
ct->ct_code);
}
#endif /* IMPDEBUG */
/* Generate the returned value. */
for (c = buffer+1,
ct = ct_table[handle].ct_array,
s = ct_table[handle].ct_size,
n = 0,
l = ct->ct_len;
s > 0;
ct++, s--)
{ if (l < 1 || l > 16)
{ fprintf (stderr, "\nError in ct_ziprep: bad code length");
return IM_LOGICERR;
}
if (n >= 16 || (int)ct->ct_len != l)
{ *c++ = (U_CHAR)((((n-1) << 4) & 0xf0) | ((l-1) & 0x0f));
n = 1; l = ct->ct_len;
}
else n++;
}
if (n > 0)
*c++ = (U_CHAR)((((n-1) << 4) & 0xf0) | ((l-1) & 0x0f));
buffer[0] = (U_CHAR)((c - buffer) - 2);
/* That's all. */
*result = buffer;
return IM_OK;
#ifdef VALIDATE
badarg:
fprintf (stderr, "\nError in ct_ziprep: bad argument(s)");
return IM_BADARG;
#endif /* VALIDATE */
}
/***********************************************************************
*
* Look up the code string for a given value.
*/
#define ct_lookup(handle, value, string, length) \
{ register TRDATA *ct; \
ct = ct_table[handle].ct_array + (value); \
string = ct->ct_code; \
length = ct->ct_len; \
}
/***********************************************************************
*
* Generate the codes for a code tree. If old codes already exist for
* the tree, they are discarded, and a new set of codes is generated
* from scratch.
* IN assertion: ma_buf is already allocated and can be overwritten.
*/
local
ImpErr
ct_gencodes (handle, minbits, maxbits, saved)
int handle; /* which tree */
int minbits; /* min code string bit length */
int maxbits; /* max code string bit length */
long *saved; /* how many bits saved */
{ register TRDATA *ct;
TRDATA *ct2;
register int n;
UL_INT f;
register RESORT *cr;
int code, srclen;
long totalfreq, totalbits;
ImpErr retcode;
RESORT rbuf[256];
int size; /* alias for ct_table[handle].ct_size */
int z; /* index of zero frequency element */
int nz; /* index of non zero frequency element */
#ifdef VALIDATE
/* Validate the arguments. */
if (!VALID_HANDLE (handle)) goto badarg;
if (minbits < 1) goto badarg;
if (maxbits > 16) goto badarg;
if (maxbits < minbits) goto badarg;
if (saved == NULL) goto badarg;
#endif /* VALIDATE */
size = ct_table[handle].ct_size;
/*
* Start by sorting the data by frequency. The source values with
* higher frequency need to get the shorter Shannon-Fano codes.
* First exclude the elements with zero frequency, to speed up the sort.
* This optimization is very important for small files, otherwise the sort
* takes most of the implode time.
* Also determine the total of all frequencies. This will be needed in
* order to partition the source values in the Shannon-Fano bit
* string computation. Finally clear the "code" (bit string) and "len"
* (bit string length) fields in the code tree array.
*/
totalfreq = 0;
ct = ct_table[handle].ct_array;
/* Copy ct into a tempo */
ct2 = (TRDATA*) ma_buf;
memcpy((char*)ct2, (char*)ct, size * sizeof(TRDATA));
for (nz = 0, z = n = size-1; n >= 0; n--) {
int m;
if (ct2[n].ct_freq != 0L) {
m = nz++;
totalfreq += (ct[m].ct_freq = ct2[n].ct_freq);
} else {
m = z--;
ct[m].ct_freq = 0L;
}
ct[m].ct_code = 0;
ct[m].ct_len = 0;
ct[m].ct_val = (U_CHAR)n; /* ct2[n].ct_val */
}
qsort ((char *) (ct_table[handle].ct_array), nz,
sizeof (TRDATA), (int (*)())ct_fsort);
/*
* Generate the bit strings via a Shannon-Fano (top-down) algorithm.
*/
retcode =
ct_split (ct_table[handle].ct_array, /* partition start */
size, /* partition size */
totalfreq, /* total frequency */
0, /* code string prefix */
0, /* # bits in prefix */
minbits, /* minimum tree depth */
maxbits); /* maximum tree depth */
if (retcode != IM_OK) return retcode;
/*
* The source value 255 needs to be assigned a bit string with a
* length of at least 10, in order to accommodate shortcuts in
* PKUNZIP's decoding algorithm. If no bit string in the tree is
* of length 10, we assign 255 to the longest string and hope for
* the best.
*/
n = size;
if (n == 256)
{ for (ct = ct_table[handle].ct_array;
n > 0 && ct->ct_val != 255;
n--, ct++) ;
if (n == 0)
{ fprintf (stderr, "\nError in ct_gencodes: no value 255");
return IM_LOGICERR;
}
if (ct->ct_len < 10)
{ ct2 = ct;
while (n > 0 && ct->ct_len < 10) n--, ct++;
if (n == 0) ct--; /* no len>=10 in tree; use longest */
n = ct->ct_val;
ct->ct_val = ct2->ct_val;
ct2->ct_val = (U_CHAR)n;
f = ct->ct_freq;
ct->ct_freq = ct2->ct_freq;
ct2->ct_freq = f;
} }
/*
* The source values need to be re-sorted so that all source values
* with code strings of the same length will be in ascending order.
* This is because of the compression scheme used to represent the
* tree in the ZIP file.
*/
for (n = size,
ct = ct_table[handle].ct_array,
cr = rbuf;
n > 0;
n--, ct++, cr++)
{ cr->ct_rlen = ct->ct_len;
cr->ct_rval = ct->ct_val;
}
n = size;
qsort ((char *) rbuf, n, sizeof (RESORT), (int (*)())ct_rsort);
for (ct = ct_table[handle].ct_array,
cr = rbuf;
n > 0;
n--, ct++, cr++)
ct->ct_val = cr->ct_rval;
#ifdef DUMP_TREE
printf ("Finished tree:\n");
for (n = size,
ct = ct_table[handle].ct_array;
n > 0;
n--, ct++)
printf (" %3d (0x%02x) l %2d c 0x%04x f %ld\n",
ct->ct_val, ct->ct_val, ct->ct_len, ct->ct_code, ct->ct_freq);
putchar ('\n');
#endif /* DUMP_TREE */
/*
* Finally, sort the tree back in ascending order by source value
* -- which is the order expected by other portions of the program.
*/
ct = ct_table[handle].ct_array;
/* Copy ct into a tempo */
ct2 = (TRDATA*)ma_buf;
memcpy((char*)ct2, (char*)ct, size * sizeof(TRDATA));
for (n = size-1; n >= 0; n--) {
U_CHAR v = ct2[n].ct_val;
ct[v].ct_freq = ct2[n].ct_freq;
ct[v].ct_code = ct2[n].ct_code;
ct[v].ct_len = ct2[n].ct_len;
ct[v].ct_val = v;
}
/*
* Determine how many bits will be saved if all the source data is
* encoded using this new set of code strings, as opposed to being
* represented directly in unencoded form.
*/
/* # of bits needed for unencoded source values */
n = ct_table[handle].ct_size;
for (code = 1, srclen = 0; code < n; code <<= 1, srclen++) ;
/* # of bits used if all source values encoded via this tree */
for (ct = ct_table[handle].ct_array,
totalbits = 0;
n > 0;
n--, ct++)
totalbits += ct->ct_freq * ct->ct_len;
/* # bits saved by using the tree */
*saved = (totalfreq * srclen) - totalbits;
/* That's all. */
return IM_OK;
#ifdef VALIDATE
badarg:
fprintf (stderr, "\nError in ct_gencodes: bad argument(s)");
return IM_BADARG;
#endif /* VALIDATE */
}
/***********************************************************************
*
* Split a portion of a code tree into two pieces of approximately equal
* total frequency. This routine calls itself recursively in order to
* generate the bit strings for the entire code tree.
*/
local
ImpErr
ct_split (part, size, freq, prefix, preflen, minbits, maxbits)
TRDATA *part; /* start of partition */
int size; /* # elements in partition */
long freq; /* sum of frequencies in partition */
int prefix; /* initial code bits for partition */
int preflen; /* # bits in prefix */
int minbits; /* minimum permissible bit length */
int maxbits; /* maximum permissible bit length */
{ register TRDATA *ct;
int topmaxbits, botmaxbits, localminbits;
U_INT topmaxvals, botmaxvals;
int topsize, botsize;
long topfreq, botfreq, halffreq, onefreq;
int n, m, leadzeros;
int maxshort, minlong;
ImpErr retcode;
static maxarray[17] =
{ 8,8,8,8,12,12,14,14,16,16,16,16,16,16,16,16,16 };
#ifdef VALIDATE
if (part == NULL) goto badarg;
if (size < 1) goto badarg;
if (freq < 0) goto badarg;
if (preflen < 0) goto badarg;
if (preflen > maxbits) goto badarg;
if (minbits < preflen) goto badarg;
if (maxbits > 16) goto badarg;
if (maxbits < minbits) goto badarg;
/*
putc ('\n', stderr);
for (n = preflen; n > 0; n--) fprintf (stderr, " ");
fprintf (stderr, "ct_split (sz=%d, pr=%04x[%d], min=%d, max=%d)",
size, prefix, preflen, minbits, maxbits);
*/
#endif /* VALIDATE */
/*
* If there's only one element in this partition, we simply take
* the "prefix" value as the code string for the single element.
* We reverse the bits of the prefix for more efficient output.
*/
if (size == 1)
{ part->ct_code = bi_reverse(prefix, preflen);
part->ct_len = (U_CHAR)preflen;
return IM_OK;
}
/*
* This partition will be divided into two parts. The "top" part
* will have a "1" bit appended to its "prefix" bit string; the
* "bottom" part will have a "0" bit appended to its "prefix".
*
* We need to determine the maximum number of source values which
* may be assigned to the two partitions. The first issue to con-
* sider is that PKUNZIP 1.10's tree-decoding shortcuts require a
* certain number of leading "0" bits in each code string, depending
* on its length. Code strings of 9-12 bits must have at least 4
* leading zeros; strings of 13 or 14 bits, at least 6 leading
* zeros; and strings of 15 or 16 bits, at least 8 leading zeros.
*
* If the "prefix" is zero, the above limitation is used to restrict
* the maximum size of the top half of the partition. The bottom
* half does not need to be restricted in this way, since it can be
* extended as far as needed along the path where the "prefix" grows
* in length but remains all zero.
*/
botmaxbits = maxbits;
if (prefix != 0) topmaxbits = maxbits;
else
{ for (n = 0, leadzeros = 0x8000;
n < preflen && (prefix & leadzeros) == 0;
n++, leadzeros >>= 1) ;
topmaxbits = maxarray[n];
if (topmaxbits > maxbits) topmaxbits = maxbits;
}
if (topmaxbits < minbits)
{ fprintf (stderr, "\nError in ct_split: ");
fprintf (stderr, "topmaxbits(%d) < minbits(%d)",
topmaxbits, minbits);
goto oops;
}
if (botmaxbits < minbits)
{ fprintf (stderr, "\nError in ct_split: ");
fprintf (stderr, "botmaxbits(%d) < minbits(%d)",
botmaxbits, minbits);
goto oops;
}
topmaxvals = 1 << (topmaxbits - preflen - 1);
n = size >> 1; if (topmaxvals > n) topmaxvals = n;
botmaxvals = 1 << (botmaxbits - preflen - 1);
n = size - 1; if (botmaxvals > n) botmaxvals = n;
if (topmaxvals + botmaxvals < size)
{ fprintf (stderr, "\nError in ct_split: ");
fprintf (stderr, "topmaxvals(%d) + botmaxvals(%d) ",
topmaxvals, botmaxvals);
fprintf (stderr, "< size(%d)", size);
goto oops;
}
/*
* We now split the current partition into two halves of as close
* to equal frequency as possible. If the total of all frequencies
* in the partition is zero, split into two halves of equal size.
*/
if (freq == 0)
{ topsize = size >> 1;
ct = part + topsize;
topfreq = 0;
}
else
{ halffreq = freq >> 1; /* half the total frequency */
m = size >> 1; /* half the total elements, */
/* rounded down */
for (topsize = 0, topfreq = 0, ct = part;
topsize < m && topfreq <= halffreq
&& (onefreq = ct->ct_freq) > 0;
topsize++, ct++)
topfreq += onefreq;
if (topsize >= 2)
{ /*
* If moving one element from the top to the bottom parti-
* tion would make the two more closely equal in frequency,
* do it.
*/
onefreq = (ct-1)->ct_freq;
if ((topfreq - halffreq) > (halffreq - (topfreq - onefreq)))
ct--, topsize--, topfreq -= onefreq;
} }
botsize = size - topsize;
botfreq = freq - topfreq;
/* "ct" points to first element in bottom half */
/*
* The above first-cut attempt to split the partition may not work
* for one of two reasons. First, one or the other half may contain
* too many values (more than "topmaxvals" or "botmaxvals").
*/
while (topsize > topmaxvals)
{ onefreq = (--ct)->ct_freq;
topsize--; topfreq -= onefreq;
botsize++; botfreq += onefreq;
}
while (botsize > botmaxvals)
{ onefreq = (ct++)->ct_freq;
topsize++; topfreq += onefreq;
botsize--; botfreq -= onefreq;
}
/*
* Second, the number of bits required to represent the values in
* each half may violate PKZIP's requirement (implicit in the way
* trees are compressed in an imploded file) that no code string in
* the top half may be longer than any code string in the bottom
* half.
*/
localminbits = preflen + 1;
if (localminbits < minbits) localminbits = minbits;
for (;;)
{ for (maxshort = preflen + 1, n = 1;
n < botsize;
maxshort++, n <<= 1) ;
if (n > botsize) maxshort--;
if (maxshort < localminbits) maxshort = localminbits;
if (maxshort > topmaxbits) maxshort = topmaxbits;
for (minlong = preflen + 1, n = 1;
n < topsize;
minlong++, n <<= 1) ;
if (minlong <= maxshort) break;
onefreq = (--ct)->ct_freq;
topsize--; topfreq -= onefreq;
botsize++; botfreq += onefreq;
}
/*
* Third, the number of elements in the top half must be enough to
* result in each string having at least "minbits" bits in all.
*/
n = 1 << (minbits - preflen - 1);
while (topsize < n)
{ onefreq = (ct++)->ct_freq;
topsize++; topfreq += onefreq;
botsize--; botfreq -= onefreq;
}
/*
* Now that the sizes of the two halves of the partition have been
* finalized, process the top and bottom halves via recursion.
*/
retcode = ct_split (part, topsize, topfreq,
prefix | (1 << (15-preflen)),
preflen + 1, localminbits, maxshort);
if (retcode != IM_OK) return retcode;
ct = part + topsize;
retcode = ct_split (ct, botsize, botfreq,
prefix, preflen + 1, (int)ct[-1].ct_len, maxbits);
if (retcode != IM_OK) return retcode;
/* That's all. */
return IM_OK;
#ifdef VALIDATE
badarg:
fprintf (stderr, "\nError in ct_split: bad argument(s)");
putchar ('\n'); fflush (stdout); fflush (stderr);
/* abort (); */
return IM_BADARG;
#endif /* VALIDATE */
oops:
#ifdef VALIDATE
putchar ('\n'); fflush (stdout); fflush (stderr);
/* abort (); */
#endif /* VALIDATE */
return IM_LOGICERR;
}
/***********************************************************************
*
* Sorting function -- descending order by source value frequency.
*
* This sorting function is used at the start of the code tree con-
* struction process, before the bit string values are assigned.
* To ensure consistent behaviour on all machines, we use the source
* values as secondary sort key, but this is not mandatory.
*/
local
int
ct_fsort (tr1, tr2)
TRDATA *tr1, *tr2;
{ long d;
int v;
d = (long) tr1->ct_freq - (long) tr2->ct_freq;
if (d < 0) return 1;
if (d > 0) return -1;
v = (int) tr1->ct_val - (int) tr2->ct_val;
if (v < 0) return 1;
if (v > 0) return -1;
return 0;
}
/***********************************************************************
*
* Sorting function -- ascending order by bit string length; if lengths
* are the same, ascending order by source value.
*
* This sorting function is used after the bit string values have been
* assigned.
*/
local
int
ct_rsort (cr1, cr2)
RESORT *cr1, *cr2;
{ int d;
d = (int) cr1->ct_rlen - (int) cr2->ct_rlen;
if (d > 0) return 1;
if (d < 0) return -1;
d = (int) cr1->ct_rval - (int) cr2->ct_rval;
if (d > 0) return 1;
if (d < 0) return -1;
return 0;
}
/***********************************************************************
*
* Allocate the code trees.
*/
ImpErr
ct_init ()
{ ImpErr retcode;
int i;
#ifdef DEBUG
if (256*sizeof(TRDATA) > MA_BUFSIZE*sizeof(MATCH)) return IM_LOGICERR;
#endif /* DEBUG */
retcode = ct_windup ();
if (retcode != IM_OK) return retcode;
ct_litc_num = 0;
ct_lit2_num = 0;
ct_strg_num = 0;
for (i = 255; i >= 0; i--)
ct_litc_freq[i] = 0;
for (i = 63; i >= 0; i--)
ct_len2_freq[i] = 0, ct_len3_freq[i] = 0,
ct_dst2_freq[i] = 0, ct_dst3_freq[i] = 0;
retcode = ct_alloc (256, &ct_litc_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_alloc (64, &ct_len2_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_alloc (64, &ct_len3_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_alloc (64, &ct_dst2_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_alloc (64, &ct_dst3_tree);
if (retcode != IM_OK) return retcode;
return IM_OK;
}
/***********************************************************************
*
* Tally a "string match" data set. The tally results will be used to
* determine how large the imploded result will be.
*/
ImpErr
ct_tally (ma)
MATCH *ma; /* match data to write out */
{ register int ch;
int dist = ma->ma_dist;
/* Tally up the latest data. */
if (dist == 0) { /* literal character */
ct_litc_num++;
ch = ma->l.ma_litc[0];
ct_litc_freq[ch]++;
} else if (dist < 0) { /* 2-character match */
ct_lit2_num++;
ch = ma->l.ma_litc[0];
ct_litc_freq[ch]++;
ch = ma->l.ma_litc[1];
ct_litc_freq[ch]++;
ch = ((-dist-1) >> fd.fd_nbits) & 0x3f;
ct_dst2_freq[ch]++;
ct_len2_freq[0]++;
} else { /* 3-char or longer match */
ct_strg_num++;
ch = ((dist-1) >> fd.fd_nbits) & 0x3f;
ct_dst3_freq[ch]++;
/* We defer the update of ct_dst2_freq and ct_len2_freq until
* ct_mktrees:
* ct_dst2_freq[ch]++;
* ch = ma->l.ma_length - 2;
* if (ch > 63) ch = 63;
* ct_len2_freq[ch]++;
*/
ch = ma->l.ma_length - 3;
if (ch > 63) ch = 63;
ct_len3_freq[ch]++;
}
/* That's all. */
return IM_OK;
}
/***********************************************************************
*
* Construct all code trees, and then determine how big the imploded
* file will be under both "literal tree" and "no literal tree" con-
* ditions. Choose the best option.
*/
ImpErr
ct_mktrees ()
{ U_CHAR *c;
ImpErr retcode;
register long sum;
long len2, len3;
int n;
/* ct_tally did not update ct_dst2_freq and ct_len2_freq for matches of
* length > 2, so correct this now.
*/
for (n = 62; n >= 0; n--) {
ct_dst2_freq[n] += ct_dst3_freq[n];
ct_len2_freq[n+1] += ct_len3_freq[n];
}
ct_dst2_freq[63] += ct_dst3_freq[63];
ct_len2_freq[63] += ct_len3_freq[63];
/*
* Construct the code trees and see how much space each will save.
*
* It is conceivable that a tree could result in a negative savings
* if its compressed form is sufficiently long.
*
* We need to construct the ZIP-file compressed representation of
* each tree in order to figure out how much space it will take.
* However, we don't save these tree representations now; rather,
* we'll wait until later and reconstruct the representations for
* whichever two (or three) trees we really need for the output.
*/
#ifdef IMPDEBUG
treename = (char *)NULL;
#endif /* IMPDEBUG */
/* literal code tree */
retcode = ct_loadf (ct_litc_tree, ct_litc_freq);
if (retcode != IM_OK) return retcode;
retcode = ct_gencodes (ct_litc_tree, 1, 16, &ct_litc_saved);
if (retcode != IM_OK) return retcode;
retcode = ct_ziprep (ct_litc_tree, &c);
if (retcode != IM_OK) return retcode;
ct_litc_saved -= (int) (c[0]+2) * 8;
/* length code tree (2) */
retcode = ct_loadf (ct_len2_tree, ct_len2_freq);
if (retcode != IM_OK) return retcode;
retcode = ct_gencodes (ct_len2_tree, 1, 16, &ct_len2_saved);
if (retcode != IM_OK) return retcode;
retcode = ct_ziprep (ct_len2_tree, &c);
if (retcode != IM_OK) return retcode;
ct_len2_saved -= (int) (c[0]+2) * 8;
/* length code tree (3) */
retcode = ct_loadf (ct_len3_tree, ct_len3_freq);
if (retcode != IM_OK) return retcode;
retcode = ct_gencodes (ct_len3_tree, 1, 16, &ct_len3_saved);
if (retcode != IM_OK) return retcode;
retcode = ct_ziprep (ct_len3_tree, &c);
if (retcode != IM_OK) return retcode;
ct_len3_saved -= (int) (c[0]+2) * 8;
/* distance code tree (2) */
retcode = ct_loadf (ct_dst2_tree, ct_dst2_freq);
if (retcode != IM_OK) return retcode;
retcode = ct_gencodes (ct_dst2_tree, 1, 8, &ct_dst2_saved);
if (retcode != IM_OK) return retcode;
retcode = ct_ziprep (ct_dst2_tree, &c);
if (retcode != IM_OK) return retcode;
ct_dst2_saved -= (int) (c[0]+2) * 8;
/* distance code tree (3) */
retcode = ct_loadf (ct_dst3_tree, ct_dst3_freq);
if (retcode != IM_OK) return retcode;
retcode = ct_gencodes (ct_dst3_tree, 1, 8, &ct_dst3_saved);
if (retcode != IM_OK) return retcode;
retcode = ct_ziprep (ct_dst3_tree, &c);
if (retcode != IM_OK) return retcode;
ct_dst3_saved -= (int) (c[0]+2) * 8;
/*
* Determine how big the compressed file will be
* with, and without, a literal character tree.
*/
/* compressed length (no literal tree) */
sum = ct_litc_num + ct_lit2_num + ct_strg_num; /* initial bit */
sum += ct_litc_num * 8; /* literal bytes */
sum += (ct_lit2_num+ct_strg_num) * 6 - ct_len2_saved; /* lengths */
sum += 8 * ct_len2_freq[63]; /* oversize lengths */
sum += (ct_lit2_num+ct_strg_num) * (fd.fd_nbits+6)
- ct_dst2_saved; /* distances */
len2 = (sum+7) / 8; /* convert to bytes */
/* compressed length (with literal tree) */
sum = ct_litc_num + 2*ct_lit2_num + ct_strg_num; /* initial bit */
sum += (ct_litc_num+2*ct_lit2_num)*8 - ct_litc_saved;/* lit bytes */
sum += ct_strg_num * 6 - ct_len3_saved; /* lengths */
sum += 8 * ct_len3_freq[63]; /* oversize lengths */
sum += ct_strg_num * (fd.fd_nbits+6) - ct_dst3_saved; /* dist's */
len3 = (sum+7) / 8; /* convert to bytes */
/*
* PKUNZIP 1.10 requires that the source value 255 in a "literal"
* tree must be represented by a bit string of length >= 10. The
* literal tree was already adjusted to ensure that the value 255
* was given a bit string of length 10 or greater if possible. If
* this did not succeed -- which would only happen if the longest
* bit string in the literal tree were of length 8 or 9 -- then the
* literal tree cannot be used. In such a case, not much would be
* gained by using it anyway, so there's little reason to be upset.
*/
if (ct_table[ct_litc_tree].ct_array[255].ct_len < 10)
len3 = len2;
/*
* Choose the method of compression which will use the least space
* for this particular file. The possibilities are: use a literal
* character tree; or, don't use a literal character tree.
*/
if (len2 <= len3)
{ fd.fd_method = NO_LITERAL_TREE;
fd.fd_clen = len2;
lit_tree = -1;
len_tree = ct_len2_tree;
dst_tree = ct_dst2_tree;
retcode = ct_free (ct_litc_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_free (ct_dst3_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_free (ct_len3_tree);
if (retcode != IM_OK) return retcode;
}
else
{ fd.fd_method = LITERAL_TREE;
fd.fd_clen = len3;
lit_tree = ct_litc_tree;
len_tree = ct_len3_tree;
dst_tree = ct_dst3_tree;
retcode = ct_free (ct_dst2_tree);
if (retcode != IM_OK) return retcode;
retcode = ct_free (ct_len2_tree);
if (retcode != IM_OK) return retcode;
}
/* That's all. */
return IM_OK;
}
/***********************************************************************
*
* Output the code trees.
*/
ImpErr
ct_wrtrees (outfp)
FILE *outfp; /* output file */
{ ImpErr retcode;
U_CHAR *c;
/* Output the literal tree, if any. */
#ifdef IMPDEBUG
treename = "Literal";
#endif /* IMPDEBUG */
if (lit_tree >= 0)
{ retcode = ct_ziprep (lit_tree, &c);
if (retcode != IM_OK) return retcode;
if (zfwrite ((char *) c, (int) (c[0]+2), 1, outfp) != 1)
return IM_IOERR;
}
/* Output the length tree. */
#ifdef IMPDEBUG
treename = "Length";
#endif /* IMPDEBUG */
retcode = ct_ziprep (len_tree, &c);
if (retcode != IM_OK) return retcode;
if (zfwrite ((char *) c, (int) (c[0]+2), 1, outfp) != 1)
return IM_IOERR;
/* Output the distance tree. */
#ifdef IMPDEBUG
treename = "Distance";
#endif /* IMPDEBUG */
retcode = ct_ziprep (dst_tree, &c);
if (retcode != IM_OK) return retcode;
if (zfwrite ((char *) c, (int) (c[0]+2), 1, outfp) != 1)
return IM_IOERR;
return IM_OK;
}
/* Macros for outputting bit string. */
#define OUTBITS(value,length) \
{ retcode = bi_rlout ((int) (value), (int) (length)); \
if (retcode != IM_OK) return retcode; \
}
#define OUTCODE(value,tree) \
{ ct_lookup (tree, value, bitstring, bitlength); \
retcode = bi_rlout (bitstring, bitlength); \
if (retcode != IM_OK) return retcode; \
}
/***********************************************************************
*
* Output the body of the file in imploded form.
*/
ImpErr
ct_wrdata (outfp)
FILE *outfp; /* output (ZIP) file */
{ MATCH *ma;
ImpErr retcode;
register int minmatch;
int bitstring, bitlength;
int bitmask = (1 << (fd.fd_nbits+1))-1;
/* Used to select the bottom 6 or 7 bits of a distance, which are
* output literally, plus 1 bit marking a distance
*/
int matches;
#ifdef IMPDEBUG
long srcpos;
#endif /* IMPDEBUG */
/* Determine the minimum match length. */
minmatch = (lit_tree >= 0) ? 3 : 2;
/* Prepare the I/O. */
if (tflush (fd.fd_temp) != 0) return IM_IOERR;
trewind (fd.fd_temp);
retcode = bi_init (outfp);
if (retcode != IM_OK) return retcode;
#ifdef IMPDEBUG
srcpos = 0;
fprintf (stderr, "\nImploded output:\n");
#endif /* IMPDEBUG */
/* Read and process data from the temporary file. */
while ((matches =
tread ((char *) ma_buf, sizeof(MATCH), MA_BUFSIZE, fd.fd_temp)) > 0)
for (ma = ma_buf; matches > 0; ma++, matches--)
{
int dist = ma->ma_dist;
int len = 0;
#ifdef IMPDEBUG
fprintf (stderr, "%8ld: ", srcpos);
#endif /* IMPDEBUG */
if (dist < 0) {
dist = -dist, len = 2;
} else if (dist > 0) {
len = ma->l.ma_length;
}
/* Output distance and length if enough characters match. */
if (len >= minmatch)
{ /* "matched string" header bit (0) */
#ifdef IMPDEBUG
fprintf (stderr, "str (dst=%d,len=%d) ",
dist, len);
srcpos += len;
#endif /* IMPDEBUG */
/* ouput one zero bit then the distance */
dist--;
OUTBITS ((dist << 1) & bitmask, fd.fd_nbits + 1);
OUTCODE (dist >> fd.fd_nbits, dst_tree);
/* length -- depends on how it compares to maximum */
len -= minmatch;
if (len >= 63)
{ /* big length -- output code for 63, then surplus */
OUTCODE (63, len_tree);
OUTBITS ((len - 63), 8);
}
else
{ /* small length -- output code */
OUTCODE (len, len_tree);
} }
else if (lit_tree >= 0)
{ /* first or single literal -- header bit (1) plus char */
#ifdef IMPDEBUG
fprintf (stderr, "lit (val=%02x) ",
ma->l.ma_litc[0] & 0xff);
srcpos++;
#endif /* IMPDEBUG */
OUTBITS (1, 1);
OUTCODE (ma->l.ma_litc[0], lit_tree);
if (len == 2)
{ /* second literal -- header bit (1) plus char */
#ifdef IMPDEBUG
fprintf (stderr, "\n%8ld: lit (val=%02x) ",
srcpos, ma->l.ma_litc[1] & 0xff);
srcpos++;
#endif /* IMPDEBUG */
OUTBITS (1, 1);
OUTCODE (ma->l.ma_litc[1], lit_tree);
} }
else
{ /* single literal -- header bit (1) plus char */
#ifdef IMPDEBUG
fprintf (stderr, "lit (val=%02x) ",
ma->l.ma_litc[0] & 0xff);
srcpos++;
#endif /* IMPDEBUG */
OUTBITS ((ma->l.ma_litc[0] << 1) + 1, 9);
}
#ifdef IMPDEBUG
putc ('\n', stderr);
#endif /* IMPDEBUG */
}
/* Make sure we hit EOF on input without an error. */
if (terror (fd.fd_temp)
#ifndef MINIX
#ifndef __TURBOC__ /* TurboC 2.0 does not set the EOF flag (?) */
|| !teof (fd.fd_temp)
#endif /* !__TURBOC__ */
#endif /* !MINIX */
)
return IM_IOERR;
retcode = bi_windup ();
if (retcode != IM_OK) return retcode;
/* That's all. */
return IM_OK;
}
#undef OUTBITS
#undef OUTCODE
/***********************************************************************
*
* Deallocate all code trees.
*/
ImpErr
ct_windup ()
{ int n;
static windup_already_called = 0;
ImpErr retcode;
if (windup_already_called)
{ /* Discard any old code trees. */
for (n = 0; n < MAXTREES; n++)
{ if (ct_table[n].ct_array != NULL)
{ retcode = ct_free (n);
if (retcode != IM_OK) return retcode;
} } }
else
{ /* Initialize the list of active code trees. */
for (n = 0; n < MAXTREES; n++)
{ ct_table[n].ct_array = NULL;
ct_table[n].ct_size = 0;
}
windup_already_called = 1;
}
/* That's all. */
return IM_OK;
}
/**********************************************************************/